## This script extracts public record codes to measure bankruptcy filings

# Load packages

import pyspark.sql.functions as sqlf
import pandas as pd
import numpy  as np
import os
import sys
import shutil
import time
from pyspark     import SparkContext, SparkConf
from pyspark.sql import SQLContext
from SparkDataTools import *

# Filesystem locations used by this script.
# WORKDIR: directory the process switches into below; also handed to saveStata.
# CODEDIR: not referenced in the visible part of this script
#          (presumably the location of the Spark code -- TODO confirm or remove).
# OUTDIR:  handed to saveStata together with WORKDIR (presumably the
#          destination for the exported tables -- verify against saveStata).
WORKDIR = '/geo_debt/geo/Spark'
CODEDIR = '/geo_debt/geo/Code/1_Spark/'
OUTDIR  = '/geo_debt/geo/Output/Table'

# Go to the working directory so that relative paths resolve against WORKDIR.
os.chdir(WORKDIR)


#########################
###       Main        ###
#########################

# Snapshot years to process; every year is combined with the same value of m,
# which is passed to yyyymm/table_setup alongside the year.
yList = list(range(2009, 2017))
m = 6

# The query text is identical for every snapshot, so it is assembled once.
# It reads the temp views "headersData" (alias b) and "pubrecsData" (alias a),
# which are re-registered for each year inside the loop. Every header row that
# passes the filter is kept (LEFT JOIN), whether or not it has a matching
# public record. The FLOOR(...) BETWEEN 20 AND 80 term presumably restricts to
# ages 20-80, assuming both dates are yyyymmdd integers -- TODO confirm.
sqlcode = (
    "SELECT b.subjectKey, b.zip, a.asOfDate, a.filedDate, a.publicRecordCode "
    "FROM headersData b LEFT JOIN pubrecsData a ON b.subjectKey=a.subjectKey "
    "WHERE b.birthDate IS NOT NULL AND FLOOR((b.asOfDate - b.birthDate)/10000) BETWEEN 20 AND 80 "
)

# (table kind passed to table_setup, temp-view name used by the query),
# listed in the order in which they must be loaded and registered.
sourceViews = (
    ('headers', "headersData"),
    ('pubrecs', "pubrecsData"),
)

for y in yList:
    ym = yyyymm(y, m)
    print(ym)

    # Load each table for this snapshot and expose it to the SQL query.
    for tableKind, viewName in sourceViews:
        table_setup(sqlContext, y, m, tableKind).createOrReplaceTempView(viewName)

    # Run the query and export the result under a per-snapshot name.
    saveStata(sqlcode, 'Bkrptcy_BH_' + ym, WORKDIR, OUTDIR)

print(' !!!! SUCCESS !!!!')

